library(tidyverse)
library(ggplot2)
library(car)
library(glmnet)
library(randomForestSRC)
library(caret)
library(ggRandomForests)
library(VSURF)VSURF
Data set
This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.
Input data set
qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |>
mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
`English Speaking`=relevel(`English Speaking`,ref="Not at all"),
Ethnicity = relevel(Ethnicity,ref="Chinese"),
Religion=relevel(Religion,ref="None")) |>
mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
"$10,000 - $19,999" ~"Below",
"$20,000 - $29,999"~"Below",
"$30,000 - $39,999"~"Below",
"$40,000 - $49,999"~"Below",
"$50,000 - $59,999"~"Below",
"$60,000 - $69,999"~"Above",
"$70,000 and over"~"Above",
.default=Income)) |>
mutate(Income_median = factor(Income_median, levels=c("Below","Above"))) |>
mutate(across(c(`Family Respect`:`Togetherness`,`Close-knit Community`:`Community Trust`),~relevel(.x,ref="Strongly disagree"))) |>
mutate(across(c(`See Family`:`Helpful Friends`,`Discrimination`),~as.factor(.x)))New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
Family
rfdata <- qol |> filter(Family %in% c("No","Yes")) |>
mutate(Family=droplevels(Family)) |>
select(Family, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)
set.seed(222)
imbal_index <- createDataPartition(rfdata$Family,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
imbal_test <- rfdata[-imbal_index,]
imbal <- ROSE::ROSE(Family~.,
data=imbal_train,
seed=3)$data
VSURF(imbal[,2:ncol(imbal)],imbal[,1],parallel=T,verbose=F)->vsurf.mod
vsurf.mod |> summary()
VSURF computation time: 39.2 secs
VSURF selected:
34 variables at thresholding step (in 6.7 secs)
21 variables at interpretation step (in 6.2 secs)
15 variables at prediction step (in 26.3 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred] [1] "Ethnicity" "Age" "Religion"
[4] "English.Difficulties" "See.Family" "Helpful.Family"
[7] "See.Friends" "Close.Family" "Helpful.Friends"
[10] "Close.Friends" "Discrimination" "Religious.Attendance"
[13] "Gender" "Religious.Importance" "Close.knit.Community"
names(rfdata[,-1])[vsurf.mod$varselect.interp] [1] "Ethnicity" "Age"
[3] "Religion" "English.Difficulties"
[5] "See.Family" "Helpful.Family"
[7] "See.Friends" "Close.Family"
[9] "Helpful.Friends" "Close.Friends"
[11] "Discrimination" "Religious.Attendance"
[13] "Gender" "Religious.Importance"
[15] "English.Speaking" "Full.Time.Employment"
[17] "Close.knit.Community" "Helpful.Community"
[19] "Community.Shares.Values" "Community.Trust"
[21] "Successful.Family"
plot(vsurf.mod)
vsurf.mod$mean.perf[1] 0.1599222
Importance
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
) |>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Ethnicity 0.07224 0.00141 red
2 Age 0.04941 0.00120 black
3 Religion 0.03771 0.00113 black
4 English.Difficulties 0.03751 0.00101 black
5 See.Family 0.03537 0.00073 black
6 Helpful.Family 0.03532 0.00097 black
7 See.Friends 0.03530 0.00073 black
8 Close.Family 0.03400 0.00069 black
9 Helpful.Friends 0.03387 0.00085 black
10 Close.Friends 0.03364 0.00087 black
11 Discrimination 0.03156 0.00115 black
12 Religious.Attendance 0.02780 0.00064 black
13 Gender 0.02591 0.00098 black
14 Religious.Importance 0.02258 0.00088 black
15 English.Speaking 0.02221 0.00071 black
16 Full.Time.Employment 0.02172 0.00109 black
17 Close.knit.Community 0.02128 0.00084 black
18 Helpful.Community 0.02127 0.00074 black
19 Community.Shares.Values 0.02075 0.00084 black
20 Community.Trust 0.02038 0.00062 black
21 Successful.Family 0.02009 0.00085 black
22 Get.Along 0.01972 0.00080 black
23 Expression 0.01349 0.00064 black
24 Similar.Values 0.01305 0.00068 black
25 Togetherness 0.01194 0.00077 black
26 Dental.Insurance 0.01190 0.00094 black
27 Spend.Time.Together 0.01092 0.00060 black
28 Family.Respect 0.00913 0.00046 black
29 Feel.Close 0.00911 0.00057 black
30 Trust 0.00891 0.00041 black
31 Income_median 0.00878 0.00043 black
32 Family.Pride 0.00649 0.00040 black
33 Loyalty 0.00601 0.00037 black
34 Health.Insurance 0.00359 0.00024 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
ggsave(filename = "VSURF_importance_family.png", width=8, height=12,units="in")Prediction
pred<- predict(vsurf.mod,newdata=imbal_test[,-1])
caret::confusionMatrix(data=pred$pred,reference=imbal_test[,1],positive="Yes")Warning in confusionMatrix.default(data = pred$pred, reference = imbal_test[, :
Levels are not in the same order for reference and data. Refactoring data to
match.
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 110 84
Yes 75 115
Accuracy : 0.5859
95% CI : (0.5349, 0.6357)
No Information Rate : 0.5182
P-Value [Acc > NIR] : 0.004511
Kappa : 0.1722
Mcnemar's Test P-Value : 0.525793
Sensitivity : 0.5779
Specificity : 0.5946
Pos Pred Value : 0.6053
Neg Pred Value : 0.5670
Prevalence : 0.5182
Detection Rate : 0.2995
Detection Prevalence : 0.4948
Balanced Accuracy : 0.5862
'Positive' Class : Yes
pROC::roc(imbal_test[,1] |> as.ordered(),pred$pred |> as.ordered()) -> rocobjSetting levels: control = No, case = Yes
Warning in value[[3L]](cond): Ordered predictor converted to numeric vector.
Threshold values will not correspond to values in predictor.
Setting direction: controls > cases
pROC::auc(rocobj)Area under the curve: 0.5862
Health Professionals
rfdata <- qol |> select(`Heal Professionals`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)
set.seed(222)
imbal_index <- createDataPartition(rfdata$Heal.Professionals,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
imbal_test <- rfdata[-imbal_index,]
imbal <- ROSE::ROSE(Heal.Professionals~.,
data=imbal_train,
seed=3)$data
VSURF(imbal[,2:ncol(imbal)],imbal[,1],parallel=T,verbose=F)->vsurf.mod
vsurf.mod |> summary()
VSURF computation time: 16.8 secs
VSURF selected:
34 variables at thresholding step (in 6.4 secs)
16 variables at interpretation step (in 5.6 secs)
2 variables at prediction step (in 4.8 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "English.Speaking" "Dental.Insurance"
names(rfdata[,-1])[vsurf.mod$varselect.interp] [1] "English.Speaking" "Religion" "Ethnicity"
[4] "English.Difficulties" "Dental.Insurance" "See.Friends"
[7] "Close.Family" "Helpful.Friends" "See.Family"
[10] "Close.Friends" "Helpful.Family" "Age"
[13] "Helpful.Community" "Income_median" "Community.Trust"
[16] "Religious.Attendance"
plot(vsurf.mod)
vsurf.mod$mean.perf[1] 0.1384565
Importance
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
) |>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 English.Speaking 0.07557 0.00207 black
2 Religion 0.06309 0.00155 black
3 Ethnicity 0.06005 0.00143 red
4 English.Difficulties 0.04423 0.00159 black
5 Dental.Insurance 0.04375 0.00248 black
6 See.Friends 0.04085 0.00105 black
7 Close.Family 0.03763 0.00113 black
8 Helpful.Friends 0.03613 0.00090 black
9 See.Family 0.03599 0.00073 black
10 Close.Friends 0.03330 0.00086 black
11 Helpful.Family 0.03172 0.00089 black
12 Age 0.02875 0.00080 black
13 Helpful.Community 0.02442 0.00089 black
14 Income_median 0.02420 0.00173 black
15 Community.Trust 0.02263 0.00070 black
16 Religious.Attendance 0.02168 0.00042 black
17 Religious.Importance 0.02145 0.00066 black
18 Get.Along 0.02125 0.00092 black
19 Close.knit.Community 0.02055 0.00070 black
20 Community.Shares.Values 0.01886 0.00067 black
21 Health.Insurance 0.01233 0.00069 black
22 Family.Respect 0.01034 0.00053 black
23 Trust 0.01006 0.00056 black
24 Similar.Values 0.00991 0.00047 black
25 Expression 0.00967 0.00037 black
26 Spend.Time.Together 0.00967 0.00048 black
27 Successful.Family 0.00954 0.00047 black
28 Feel.Close 0.00936 0.00042 black
29 Discrimination 0.00865 0.00043 black
30 Family.Pride 0.00845 0.00040 black
31 Togetherness 0.00797 0.00034 black
32 Gender 0.00686 0.00031 black
33 Loyalty 0.00666 0.00030 black
34 Full.Time.Employment 0.00605 0.00039 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
# coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
ggsave(filename = "VSURF_importance_hp.png", width=8, height=12,units="in")Prediction
pred<- predict(vsurf.mod,newdata=imbal_test[,-1])
caret::confusionMatrix(data=pred$pred,reference=imbal_test[,1],positive="Yes")Warning in confusionMatrix.default(data = pred$pred, reference = imbal_test[, :
Levels are not in the same order for reference and data. Refactoring data to
match.
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 100 73
Yes 85 127
Accuracy : 0.5896
95% CI : (0.5386, 0.6392)
No Information Rate : 0.5195
P-Value [Acc > NIR] : 0.003353
Kappa : 0.176
Mcnemar's Test P-Value : 0.381512
Sensitivity : 0.6350
Specificity : 0.5405
Pos Pred Value : 0.5991
Neg Pred Value : 0.5780
Prevalence : 0.5195
Detection Rate : 0.3299
Detection Prevalence : 0.5506
Balanced Accuracy : 0.5878
'Positive' Class : Yes
pROC::roc(imbal_test[,1] |> as.ordered(),pred$pred |> as.ordered()) -> rocobjSetting levels: control = No, case = Yes
Warning in value[[3L]](cond): Ordered predictor converted to numeric vector.
Threshold values will not correspond to values in predictor.
Setting direction: controls > cases
pROC::auc(rocobj)Area under the curve: 0.5878
Physical Check-up
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Physical Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
set.seed(222)
imbal_index <- createDataPartition(rfdata$Physical.Check.up,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
imbal_test <- rfdata[-imbal_index,]
imbal <- ROSE::ROSE(Physical.Check.up~.,
data=imbal_train,
seed=3)$data
VSURF(imbal[,2:ncol(imbal)],imbal[,1],parallel=T,verbose=F)->vsurf.mod
vsurf.mod |> summary()
VSURF computation time: 37.3 secs
VSURF selected:
34 variables at thresholding step (in 6 secs)
26 variables at interpretation step (in 5.1 secs)
18 variables at prediction step (in 26.2 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred] [1] "Dental.Insurance" "Age" "Health.Insurance"
[4] "Ethnicity" "Helpful.Family" "Religion"
[7] "Gender" "See.Family" "EnglishDiff"
[10] "See.Friends" "Income_median" "Close.Family"
[13] "Helpful.Friends" "Close.Friends" "Religious.Importance"
[16] "EnglishSpeak" "Helpful.Community" "Spend.Time.Together"
names(rfdata[,-1])[vsurf.mod$varselect.interp] [1] "Dental.Insurance" "Age"
[3] "Health.Insurance" "Ethnicity"
[5] "Helpful.Family" "Religion"
[7] "Gender" "See.Family"
[9] "EnglishDiff" "See.Friends"
[11] "Income_median" "Close.Family"
[13] "Helpful.Friends" "Close.Friends"
[15] "Religious.Importance" "EnglishSpeak"
[17] "Close.knit.Community" "Helpful.Community"
[19] "Religious.Attendance" "Community.Shares.Values"
[21] "Get.Along" "Community.Trust"
[23] "Discrimination" "Spend.Time.Together"
[25] "Employment" "Expression"
plot(vsurf.mod)
vsurf.mod$mean.perf[1] 0.09121094
Importance
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
) |>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Dental.Insurance 0.07270 0.00199 black
2 Age 0.06376 0.00151 black
3 Health.Insurance 0.05786 0.00155 black
4 Ethnicity 0.05548 0.00123 red
5 Helpful.Family 0.03567 0.00110 black
6 Religion 0.03456 0.00102 black
7 Gender 0.03405 0.00095 black
8 See.Family 0.03347 0.00078 black
9 EnglishDiff 0.03337 0.00113 black
10 See.Friends 0.03066 0.00084 black
11 Income_median 0.02859 0.00136 black
12 Close.Family 0.02760 0.00063 black
13 Helpful.Friends 0.02711 0.00068 black
14 Close.Friends 0.02547 0.00070 black
15 Religious.Importance 0.02493 0.00086 black
16 EnglishSpeak 0.02400 0.00107 black
17 Close.knit.Community 0.02199 0.00068 black
18 Helpful.Community 0.02002 0.00097 black
19 Religious.Attendance 0.01943 0.00059 black
20 Community.Shares.Values 0.01868 0.00077 black
21 Get.Along 0.01606 0.00076 black
22 Community.Trust 0.01538 0.00061 black
23 Discrimination 0.01490 0.00080 black
24 Spend.Time.Together 0.01166 0.00054 black
25 Employment 0.01088 0.00077 black
26 Expression 0.00994 0.00040 black
27 Similar.Values 0.00927 0.00033 black
28 Trust 0.00792 0.00039 black
29 Feel.Close 0.00734 0.00038 black
30 Family.Pride 0.00691 0.00041 black
31 Family.Respect 0.00682 0.00025 black
32 Togetherness 0.00646 0.00029 black
33 Successful.Family 0.00599 0.00026 black
34 Loyalty 0.00586 0.00017 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
# coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
ggsave(filename = "VSURF_importance_PC.png", width=8, height=12,units="in")Prediction
pred<- predict(vsurf.mod,newdata=imbal_test[,-1])
caret::confusionMatrix(data=pred$pred,reference=imbal_test[,1],positive="Yes")Warning in confusionMatrix.default(data = pred$pred, reference = imbal_test[, :
Levels are not in the same order for reference and data. Refactoring data to
match.
Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 71 69
Yes 51 191
Accuracy : 0.6859
95% CI : (0.6367, 0.7321)
No Information Rate : 0.6806
P-Value [Acc > NIR] : 0.4372
Kappa : 0.3047
Mcnemar's Test P-Value : 0.1207
Sensitivity : 0.7346
Specificity : 0.5820
Pos Pred Value : 0.7893
Neg Pred Value : 0.5071
Prevalence : 0.6806
Detection Rate : 0.5000
Detection Prevalence : 0.6335
Balanced Accuracy : 0.6583
'Positive' Class : Yes
pROC::roc(imbal_test[,1] |> as.ordered(),pred$pred |> as.ordered()) -> rocobjSetting levels: control = 0, case = Yes
Warning in value[[3L]](cond): Ordered predictor converted to numeric vector.
Threshold values will not correspond to values in predictor.
Setting direction: controls > cases
pROC::auc(rocobj)Area under the curve: 0.6583
Dental Check-up
rfdata <- qol |> select(`Dentist Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)
set.seed(222)
imbal_index <- createDataPartition(rfdata$Dentist.Check.up,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
imbal_test <- rfdata[-imbal_index,]
imbal <- ROSE::ROSE(Dentist.Check.up~.,
data=imbal_train,
seed=3)$data
VSURF(imbal[,2:ncol(imbal)],imbal[,1],parallel=T,verbose=F)->vsurf.mod
vsurf.mod |> summary()
VSURF computation time: 39.9 secs
VSURF selected:
34 variables at thresholding step (in 6.2 secs)
31 variables at interpretation step (in 5.1 secs)
14 variables at prediction step (in 28.6 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred] [1] "Dental.Insurance" "Religion" "See.Friends"
[4] "Close.Friends" "Close.Family" "Helpful.Friends"
[7] "English.Speaking" "Helpful.Family" "English.Difficulties"
[10] "Religious.Importance" "Health.Insurance" "Community.Trust"
[13] "Successful.Family" "Get.Along"
names(rfdata[,-1])[vsurf.mod$varselect.interp] [1] "Dental.Insurance" "Ethnicity"
[3] "Age" "Income_median"
[5] "Religion" "See.Family"
[7] "See.Friends" "Close.Friends"
[9] "Close.Family" "Helpful.Friends"
[11] "English.Speaking" "Helpful.Family"
[13] "English.Difficulties" "Religious.Importance"
[15] "Health.Insurance" "Community.Trust"
[17] "Community.Shares.Values" "Gender"
[19] "Close.knit.Community" "Religious.Attendance"
[21] "Successful.Family" "Get.Along"
[23] "Helpful.Community" "Similar.Values"
[25] "Feel.Close" "Spend.Time.Together"
[27] "Expression" "Full.Time.Employment"
[29] "Family.Respect" "Togetherness"
[31] "Loyalty"
plot(vsurf.mod)
vsurf.mod$mean.perf[1] 0.1140248
Importance
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
) |>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Dental.Insurance 0.11404 0.00189 black
2 Ethnicity 0.05543 0.00167 red
3 Age 0.04809 0.00148 black
4 Income_median 0.04399 0.00227 black
5 Religion 0.04101 0.00122 black
6 See.Family 0.03607 0.00098 black
7 See.Friends 0.03576 0.00062 black
8 Close.Friends 0.03027 0.00065 black
9 Close.Family 0.03005 0.00072 black
10 Helpful.Friends 0.02874 0.00102 black
11 English.Speaking 0.02806 0.00101 black
12 Helpful.Family 0.02748 0.00082 black
13 English.Difficulties 0.02480 0.00098 black
14 Religious.Importance 0.02143 0.00064 black
15 Health.Insurance 0.02010 0.00126 black
16 Community.Trust 0.01850 0.00075 black
17 Community.Shares.Values 0.01781 0.00063 black
18 Gender 0.01712 0.00114 black
19 Close.knit.Community 0.01666 0.00054 black
20 Religious.Attendance 0.01605 0.00043 black
21 Successful.Family 0.01385 0.00067 black
22 Get.Along 0.01369 0.00040 black
23 Helpful.Community 0.01285 0.00046 black
24 Similar.Values 0.01165 0.00047 black
25 Feel.Close 0.01049 0.00043 black
26 Spend.Time.Together 0.00915 0.00045 black
27 Expression 0.00899 0.00053 black
28 Full.Time.Employment 0.00850 0.00073 black
29 Family.Respect 0.00786 0.00045 black
30 Togetherness 0.00671 0.00028 black
31 Loyalty 0.00666 0.00038 black
32 Discrimination 0.00645 0.00042 black
33 Trust 0.00622 0.00039 black
34 Family.Pride 0.00529 0.00029 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
# coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
ggsave(filename = "VSURF_importance_Dc.png", width=8, height=12,units="in")Prediction
pred<- predict(vsurf.mod,newdata=imbal_test[,-1])
caret::confusionMatrix(data=pred$pred,reference=imbal_test[,1],positive="Yes")Warning in confusionMatrix.default(data = pred$pred, reference = imbal_test[, :
Levels are not in the same order for reference and data. Refactoring data to
match.
Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 117 74
Yes 40 151
Accuracy : 0.7016
95% CI : (0.6529, 0.747)
No Information Rate : 0.589
P-Value [Acc > NIR] : 3.417e-06
Kappa : 0.4031
Mcnemar's Test P-Value : 0.001997
Sensitivity : 0.6711
Specificity : 0.7452
Pos Pred Value : 0.7906
Neg Pred Value : 0.6126
Prevalence : 0.5890
Detection Rate : 0.3953
Detection Prevalence : 0.5000
Balanced Accuracy : 0.7082
'Positive' Class : Yes
pROC::roc(imbal_test[,1] |> as.ordered(),pred$pred |> as.ordered()) -> rocobjSetting levels: control = 0, case = Yes
Warning in value[[3L]](cond): Ordered predictor converted to numeric vector.
Threshold values will not correspond to values in predictor.
Setting direction: controls > cases
pROC::auc(rocobj)Area under the curve: 0.7082
Folkmedicine
rfdata <- qol |> select(`Folkmedicine`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)
set.seed(222)
imbal_index <- createDataPartition(rfdata$Folkmedicine,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
imbal_test <- rfdata[-imbal_index,]
imbal <- ROSE::ROSE(Folkmedicine~.,
data=imbal_train,
seed=3)$data
VSURF(imbal[,2:ncol(imbal)],imbal[,1],parallel=T,verbose=F)->vsurf.mod
vsurf.mod |> summary()
VSURF computation time: 38.8 secs
VSURF selected:
34 variables at thresholding step (in 5.7 secs)
31 variables at interpretation step (in 6.1 secs)
15 variables at prediction step (in 27 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred] [1] "Ethnicity" "Helpful.Friends"
[3] "See.Family" "Close.Friends"
[5] "English.Speaking" "Helpful.Family"
[7] "Close.Family" "English.Difficulties"
[9] "Religious.Importance" "Helpful.Community"
[11] "Religious.Attendance" "Community.Shares.Values"
[13] "Get.Along" "Family.Pride"
[15] "Feel.Close"
names(rfdata[,-1])[vsurf.mod$varselect.interp] [1] "Ethnicity" "Age"
[3] "Religion" "See.Friends"
[5] "Helpful.Friends" "See.Family"
[7] "Close.Friends" "English.Speaking"
[9] "Helpful.Family" "Close.Family"
[11] "English.Difficulties" "Religious.Importance"
[13] "Helpful.Community" "Religious.Attendance"
[15] "Community.Shares.Values" "Close.knit.Community"
[17] "Get.Along" "Community.Trust"
[19] "Family.Pride" "Spend.Time.Together"
[21] "Feel.Close" "Similar.Values"
[23] "Full.Time.Employment" "Gender"
[25] "Loyalty" "Family.Respect"
[27] "Expression" "Discrimination"
[29] "Successful.Family" "Trust"
[31] "Dental.Insurance"
plot(vsurf.mod)
vsurf.mod$mean.perf[1] 0.03628289
Importance
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
) |>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Ethnicity 0.14026 0.00194 red
2 Age 0.07429 0.00149 black
3 Religion 0.07297 0.00238 black
4 See.Friends 0.07109 0.00183 black
5 Helpful.Friends 0.05528 0.00142 black
6 See.Family 0.05156 0.00140 black
7 Close.Friends 0.05120 0.00110 black
8 English.Speaking 0.05071 0.00194 black
9 Helpful.Family 0.04559 0.00098 black
10 Close.Family 0.04244 0.00109 black
11 English.Difficulties 0.03898 0.00096 black
12 Religious.Importance 0.03875 0.00138 black
13 Helpful.Community 0.03472 0.00096 black
14 Religious.Attendance 0.03128 0.00106 black
15 Community.Shares.Values 0.03101 0.00137 black
16 Close.knit.Community 0.02941 0.00073 black
17 Get.Along 0.02930 0.00095 black
18 Community.Trust 0.02660 0.00086 black
19 Family.Pride 0.02386 0.00160 black
20 Spend.Time.Together 0.02028 0.00070 black
21 Feel.Close 0.02028 0.00139 black
22 Similar.Values 0.01895 0.00072 black
23 Full.Time.Employment 0.01841 0.00127 black
24 Gender 0.01679 0.00091 black
25 Loyalty 0.01669 0.00098 black
26 Family.Respect 0.01572 0.00100 black
27 Expression 0.01547 0.00062 black
28 Discrimination 0.01476 0.00083 black
29 Successful.Family 0.01404 0.00070 black
30 Trust 0.01356 0.00083 black
31 Dental.Insurance 0.01147 0.00041 black
32 Togetherness 0.01106 0.00041 black
33 Income_median 0.00966 0.00064 black
34 Health.Insurance 0.00565 0.00033 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
# coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
ggsave(filename = "VSURF_importance_Alt.png", width=8, height=12,units="in")Prediction
pred<- predict(vsurf.mod,newdata=imbal_test[,-1])
caret::confusionMatrix(data=pred$pred,reference=imbal_test[,1],positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 291 43
Yes 37 8
Accuracy : 0.7889
95% CI : (0.7443, 0.8289)
No Information Rate : 0.8654
P-Value [Acc > NIR] : 1.0000
Kappa : 0.0464
Mcnemar's Test P-Value : 0.5762
Sensitivity : 0.15686
Specificity : 0.88720
Pos Pred Value : 0.17778
Neg Pred Value : 0.87126
Prevalence : 0.13456
Detection Rate : 0.02111
Detection Prevalence : 0.11873
Balanced Accuracy : 0.52203
'Positive' Class : Yes
pROC::roc(imbal_test[,1] |> as.ordered(),pred$pred |> as.ordered()) -> rocobjSetting levels: control = 0, case = Yes
Warning in value[[3L]](cond): Ordered predictor converted to numeric vector.
Threshold values will not correspond to values in predictor.
Setting direction: controls < cases
pROC::auc(rocobj)Area under the curve: 0.522